***  Program to construct variables for CDS 
* - constructs variables used in the analysis
* - introduces some sample selection restrictions 
set more off

* Close any unnamed log left open by a previous run that stopped before
* reaching -log close- at the end of this file; without this, -log using-
* below aborts with r(604) "log file already open".
* -capture- turns the command into a no-op when no log is open.
capture log close
log using "$log_files/3_data_cleaning_child_panel", replace

********************************************************************************
*********************** A. READ-IN THE CHILD PANEL *******************************
********************************************************************************

* Empty memory, then load the child panel
clear
use "$temp/psid_child.dta"

********************************************************************************
*********************** B. INITIAL SAMPLE SELECTION *******************************
********************************************************************************

* Observations outside the analysis sample are flagged with ind_not_sample = 1
* (nothing is dropped here; ind_not_sample must already exist in the data).

* Mothers must be aged 16-45 when the child was born.
* A system-missing age (.) is left unflagged by this rule.
replace ind_not_sample = 1 if !inrange(momageatbirth, 16, 45) & momageatbirth != .

************ Year 1997- 2007 ************

* Flag every observation outside 1997-2007 (a missing year is flagged too)
replace ind_not_sample = 1 if !inrange(year, 1997, 2007)


********************************************************************************
*********************** C. DATA CLEANING AND CONSTRUCTION **********************
********************************************************************************



*********************** Cleaning childcare expenditures **********************

* Set unrealistic responses to missing: amounts strictly between 0 and 1 and
* amounts above 500. Missing values also satisfy "> 500", so every missing
* code in these variables ends up as the system missing value (.).
foreach cost of varlist chcare chcare_second chcare_hh_pc {
    replace `cost' = . if (`cost' > 0 & `cost' < 1)
    replace `cost' = . if `cost' > 500
}


* When the main arrangement cost is reported but the second arrangement cost
* is missing, set the second arrangement cost to zero
replace chcare_second = 0 if chcare_second == . & chcare < .


* Aggregate child-specific childcare measure: main + second arrangement
generate chcare_exp = chcare + chcare_second
label variable chcare_exp        "EXPENDITURE ON MAIN AND SECOND CHILDCARE ARRANGEMENT" 

* chcare_hh_pc: weekly measure of HH expenditures on childcare per kid ages 0-12 (avail 2002)
label variable chcare_hh_pc        "HH EXPENDITURE ON CHILDCARE PER CHILD" 


*Generate positive reports indicators 
* Each indicator equals 1 for a strictly positive report, 0 otherwise, and is
* left missing where the underlying measure is the system missing value (.).
generate pos_chcare_exp = (chcare_exp > 0) if chcare_exp != .
label variable pos_chcare_exp  "POSITIVE CHILD-BASED CHILDCARE REPORT (MAIN+SECOND)" 


generate pos_chcare_hh_pc = (chcare_hh_pc > 0) if chcare_hh_pc != .
label variable pos_chcare_hh_pc  "POSITIVE HH PER CAPITA CHILDCARE REPORT" 


* Same indicator for the main and the second arrangement separately (unlabelled)
foreach arrangement in chcare chcare_second {
    generate pos_`arrangement' = (`arrangement' > 0) if `arrangement' != .
}



*Measure of imputed childcare expenditures 
* Combines the child-specific report (chcare_exp) with the per-child household
* report (chcare_hh_pc):
*   both positive                      -> average of the two
*   one positive, other zero/missing   -> the positive one
*   one zero, other zero/missing       -> zero
*   both missing                       -> missing

generate chcare_imp = .

* both reports positive: average
replace chcare_imp = (chcare_exp + chcare_hh_pc)/2 if pos_chcare_exp == 1 & pos_chcare_hh_pc == 1

* exactly one report positive (the other is zero or missing): take the positive one
replace chcare_imp = chcare_exp   if pos_chcare_exp == 1   & (pos_chcare_hh_pc == 0 | pos_chcare_hh_pc == .)
replace chcare_imp = chcare_hh_pc if pos_chcare_hh_pc == 1 & (pos_chcare_exp == 0   | pos_chcare_exp == .)

* no positive report, but at least one report of zero: zero
replace chcare_imp = 0 if pos_chcare_exp == 0 & (pos_chcare_hh_pc == 0 | pos_chcare_hh_pc == .)
replace chcare_imp = 0 if pos_chcare_exp == . & pos_chcare_hh_pc == 0


* Positive-report indicator for the imputed measure (missing when chcare_imp is .)
gen pos_chcare_imp = (chcare_imp > 0) if chcare_imp != .


* Logs of the three childcare measures (ln() and log() are the same function
* in Stata; zero expenditures yield a missing log)
generate ln_chcare_exp   = ln(chcare_exp)
generate ln_chcare_hh_pc = ln(chcare_hh_pc)
generate ln_chcare_imp   = ln(chcare_imp)


*********************** Cleaning time  expenditures**********************
* Keep the incoming tau_m / tau_f under *_tot names so that tau_m / tau_f can
* be redefined below
rename tau_m tau_m_tot
rename tau_f tau_f_tot

********************************************************************************
*use social+investment time input measure for summary and regressions

generate  tau_m=tau_m_socinvest
generate tau_f=tau_f_socinvest
********************************************************************************

*Set father's time investment to missing if mothers are not married
replace tau_f=. if curr_married==.|curr_married==0

* Missing values are tested with missing() throughout this section so that
* extended missing codes (.a-.z) are handled the same way as "." in the
* outlier flags and in the positive-report indicators.

*Generate indicators for outlier values
* 1 if the time input is below 0.25 or above 100, 0 otherwise, missing when the
* time input itself is missing
gen outlier_tau_m = (tau_m<0.25 | tau_m>100) if !missing(tau_m)
gen outlier_tau_f = (tau_f<0.25 | tau_f>100) if !missing(tau_f)

tab outlier_tau_m
tab outlier_tau_f

* only keep logs for tau values between .25 and 100
gen ln_tau_m = ln(tau_m) if outlier_tau_m != 1
gen ln_tau_f = ln(tau_f) if outlier_tau_f != 1

* Positive-time indicators: 1 if tau > 0, 0 if tau <= 0, missing if tau is
* missing. (Previously an extended missing code in tau passed the "tau>0 &
* tau!=." test and was recorded as a positive report.)
gen pos_tau_m = (tau_m>0) if !missing(tau_m)
gen pos_tau_f = (tau_f>0) if !missing(tau_f)


*********************** Cleaning goods expenditures **********************

* Missing values are tested with missing() so that extended missing codes
* (.a-.z) are treated like "." in both the outlier flag and the
* positive-report indicator.

*Generate indicators for outlier values
* 1 if household investment is below 1, 0 otherwise, missing when hhinvest is missing
gen outlier_hhinvest = (hhinvest<1) if !missing(hhinvest)
tab outlier_hhinvest

* ln_hhinvest = ln(g*p); only keep logs if HH expenditures are at least $1/week
gen ln_hhinvest = ln(hhinvest) if outlier_hhinvest != 1

* Positive-expenditure indicator: 1 if hhinvest > 0, 0 if hhinvest <= 0,
* missing if hhinvest is missing. (Previously an extended missing code passed
* the "hhinvest>0 & hhinvest!=." test and was recorded as positive.)
gen pos_hhinvest = (hhinvest>0) if !missing(hhinvest)


*********************** Positive ind earnings and work hours **********************

* Indicators for positive earnings and positive work hours, for mother (m) and
* father (f). All four indicators are left missing when that parent's HOURS
* are missing (.).
* NOTE(review): the earnings indicators are 0, not missing, when earnings are
* missing but hours are observed. Confirm this is the intended rule.

foreach measure in earn hrs {
    foreach parent in m f {
        gen pos_`measure'_`parent' = (`parent'_`measure'>0 & `parent'_`measure' != .) if `parent'_hrs != .
    }
}

* Both parents report positive hours (missing if either indicator is missing)
gen pos_hrs_mf = pos_hrs_m*pos_hrs_f

*********************** Log relative input price ratios **********************


* Weighted average HH investment price: weight 0.3 on p_serv, 0.7 on p_goods
gen p_avg = 0.3*p_serv + .7*p_goods


* Log relative input price ratios. The childcare cost is scaled from annual
* into hourly units assuming average full-time care is 33 hrs/week x 52 weeks.
local care_hours_per_year = 33*52

* childcare price relative to the average investment price, and on its own
gen ln_pratio_4ca = ln(p_yocent_e_cps_cpkt) - ln(p_avg) - ln(`care_hours_per_year')
gen ln_pchcare_4ca = ln(p_yocent_e_cps_cpkt) - ln(`care_hours_per_year')


* log wages relative to the average investment price
foreach parent in m f {
    gen ln_wage_`parent'_rel = ln_wage_`parent' - ln(p_avg)
}

* hourly childcare price relative to each parent's wage
foreach parent in m f {
    gen ln_P4ca_W`parent'_ratio = ln(p_yocent_e_cps_cpkt) - ln_wage_`parent' - ln(`care_hours_per_year')
}


*********************** Log input expenditure ratios **********************

* Combined parental time: mother's time, plus father's time when currently married
gen tau_mf = cond(curr_married==1, tau_m + tau_f, tau_m)

* Time expenditure = wage x time, per parent
foreach parent in m f {
    gen tau_`parent'_exp = `parent'_wage*tau_`parent'
}

* Combined time expenditure, built the same way as tau_mf
gen tau_mf_exp = cond(curr_married==1, tau_m_exp + tau_f_exp, tau_m_exp)


* Father relative to mother
gen ln_tau_fm_ratio = ln_tau_f - ln_tau_m                              // ln(tau_f/tau_m)
gen ln_tau_fm_exp_ratio = ln_tau_f + ln_wage_f - ln_tau_m - ln_wage_m  // ln(W_f tau_f/ (W_m tau_m))

* Time relative to goods, per parent: ln(tau/g), where ln(g) = ln_hhinvest - ln(p_avg)
foreach parent in m f {
    gen ln_tau_`parent'_g_ratio = ln_tau_`parent' - ln_hhinvest + ln(p_avg)
}

* Time expenditure relative to goods expenditure, per parent: ln(W*tau/(p*g))
foreach parent in m f {
    gen ln_tau_`parent'_g_exp_ratio = ln_tau_`parent' + ln_wage_`parent' - ln_hhinvest
}


* ln(PY/(p*g)) -- imputed measure from child-specific & per capita HH measures
gen ln_invratio_imp = ln_chcare_imp - ln_hhinvest

* Childcare expenditure relative to time expenditure, per parent: ln(Y*P/(W*tau))
foreach parent in m f {
    gen ln_YP_tauW_`parent' = ln_chcare_imp - (ln_tau_`parent' + ln_wage_`parent')
}


****************Transform children test scores*******************************

* Rescale the standard scores: subtract 100 and divide by 15
generate AP=(ap_std-100)/15
generate LW=(lw_std-100)/15

*Make test scores in 1997 and 2002 available for all years
* For each kid, spread the 1997 and the 2002 score over all of that kid's rows
* (missing when the kid has no score in that year).

local rescaled AP LW
local spread   ap lw
forvalues i = 1/2 {
    local score : word `i' of `rescaled'
    local stub  : word `i' of `spread'
    bys kid: egen `stub'97 = max(cond(year==1997, `score', .))
    bys kid: egen `stub'02 = max(cond(year==2002, `score', .))
}

*********** Generate indicator for some time spent with relatives *********** 
* 1 if relative_present is at least 1 (hour per week), 0 if it is below 1,
* missing if relative_present is the system missing value (.)

generate ind_relative_present = (relative_present >= 1) if relative_present != .

* Whole-panel indicator: 1 if the kid has the indicator equal to 1 in at least
* one year, constant within kid

bysort kid: egen ind_relative_97_07 = max(ind_relative_present)


* Indicators built from the head's state_grow_up code:
*   ind_same_state  = 1 when the code is 1
*   ind_same_region = 1 when the code is 1 or 2
* Both are missing when state_grow_up is the system missing value (.).

gen ind_same_state  = (state_grow_up == 1) if state_grow_up != .
gen ind_same_region = inlist(state_grow_up, 1, 2) if state_grow_up != .

* Declare the panel so the lag operator can be used
xtset kid year

*Use last year data for non-survey years
foreach flag of varlist ind_same_state ind_same_region {
    replace `flag' = L1.`flag' if inlist(year, 1998, 2000, 2002, 2004, 2006)
}


******************************** Make balanced panel********************************
* -fillin- adds one row for every kid x year combination not yet in the data;
* on the added rows every other existing variable is missing
fillin kid year
* Copy each kid's mid onto all of that kid's rows, including the added ones
bysort kid: egen MID = max(mid)
summarize MID mid

**************** Generate indicator if all prices available ****************
* ind_price      : row-level flag, 1 if all required prices are observed in that year
* ind_price_YYYY : kid-level flag, 1 on every row of a kid whose prices are all
*                  observed in year YYYY
* Required prices: mother's log wage, p_avg > 0 and the childcare price > 0,
* plus the father's log wage when curr_married == 1. Rows with curr_married
* missing never qualify.
generate ind_price=0

local prices_ok "ln_wage_m<. & p_avg>0 & p_avg<. & p_yocent_e_cps_cpkt>0 & p_yocent_e_cps_cpkt<."

forvalues yr = 1997/2007 {
    * Row-level flag for this year only; it is 0 (not missing) on rows of other years
    tempvar ok_this_year
    generate `ok_this_year' = 0
    replace `ok_this_year' = 1 if `prices_ok' & year==`yr' & ((curr_married==1 & ln_wage_f<.) | curr_married==0)
    replace ind_price = 1 if `ok_this_year'==1

    * Spread this year's flag over all rows of the kid
    bysort kid: egen ind_price_`yr' = max(`ok_this_year')
    drop `ok_this_year'
}

*All prices are available between 1997-2001
generate ind_price_97_01 = ind_price_1997*ind_price_1998*ind_price_1999 ///
                         *ind_price_2000*ind_price_2001

*All prices are available between 2002-2006
generate ind_price_02_06 = ind_price_2002*ind_price_2003*ind_price_2004 ///
                         *ind_price_2005*ind_price_2006

* Remove all value labels from memory
label drop _all

summarize ind_price*


* Indicator for at least one positive (and non-missing) CDS investment measure
* on the row: mother's time, father's time, household goods investment, or
* imputed childcare expenditure

gen CDS_invest_ind = (tau_m>0 & tau_m!=.)       | ///
                     (tau_f>0 & tau_f!=.)       | ///
                     (hhinvest>0 & hhinvest!=.) | ///
                     (chcare_imp>0 & chcare_imp!=.)

* Force the indicator to zero outside the CDS years 1997, 2002 and 2007
replace CDS_invest_ind = 0 if !inlist(year, 1997, 2002, 2007)

* Kid-level indicator: 1 on all rows of a kid with at least one positive CDS
* investment measure in 1997, 2002 or 2007

bysort kid: egen CDS_invest_ind_97_07 = max(CDS_invest_ind)


********************************************************************************
*********************** D. SAVE DATA FOR REASONABLY AGED PARENTS****************
********************************************************************************

* Order rows by kid and year before writing anything
sort kid year

* CSV copy of the data currently in memory.
* NOTE(review): this export runs BEFORE the sample restriction below, so the
* CSV still holds the rows flagged ind_not_sample = 1 and the rows added by
* -fillin-. Confirm this is intended given the section title.
export delimited using "$data/psid_fam.csv", replace


* Keep only rows with ind_not_sample equal to 0, then save the Stata file.
* Rows where ind_not_sample is missing (e.g. rows added by -fillin-) are
* dropped as well.
drop if ind_not_sample != 0
save "$data/psid_fam.dta", replace


log close



